%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/

%% Created for hankcs at 2022-04-15 10:32:15 -0400 


%% Saved with string encoding Unicode (UTF-8) 



%% Mengzi pre-trained models for Chinese (arXiv preprint 2110.06696).
%% The arXiv identifier is exposed through eprint/archiveprefix/url in addition to the
%% journal string; journal is kept because classic styles require it for article entries.
@article{zhang2021mengzi,
	archiveprefix = {arXiv},
	author = {Zhang, Zhuosheng and Zhang, Hanqing and Chen, Keming and Guo, Yuhang and Hua, Jingyun and Wang, Yulong and Zhou, Ming},
	date-added = {2022-04-15 10:32:14 -0400},
	date-modified = {2022-04-15 10:32:14 -0400},
	eprint = {2110.06696},
	journal = {arXiv preprint arXiv:2110.06696},
	title = {Mengzi: Towards Lightweight yet Ingenious Pre-trained Models for {C}hinese},
	url = {https://arxiv.org/abs/2110.06696},
	year = {2021}}

%% PERIN: permutation-invariant sentence-to-graph semantic parsing (MRP 2020 shared task system).
@inproceedings{samuel-straka-2020-ufal,
  author        = {Samuel, David and Straka, Milan},
  title         = {{{\'U}FAL} at {MRP} 2020: Permutation-invariant Semantic Parsing in {PERIN}},
  booktitle     = {Proceedings of the CoNLL 2020 Shared Task: Cross-Framework Meaning Representation Parsing},
  year          = {2020},
  month         = nov,
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  pages         = {53--64},
  doi           = {10.18653/v1/2020.conll-shared.5},
  url           = {https://aclanthology.org/2020.conll-shared.5},
  abstract      = {We present PERIN, a novel permutation-invariant approach to sentence-to-graph semantic parsing. PERIN is a versatile, cross-framework and language independent architecture for universal modeling of semantic structures. Our system participated in the CoNLL 2020 shared task, Cross-Framework Meaning Representation Parsing (MRP 2020), where it was evaluated on five different frameworks (AMR, DRG, EDS, PTG and UCCA) across four languages. PERIN was one of the winners of the shared task. The source code and pretrained models are available at http://www.github.com/ufal/perin.},
  date-added    = {2022-04-12 22:36:23 -0400},
  date-modified = {2022-04-12 22:36:23 -0400},
  bdsk-url-1    = {https://aclanthology.org/2020.conll-shared.5},
  bdsk-url-2    = {https://doi.org/10.18653/v1/2020.conll-shared.5},
}

%% Multi-view Chinese treebanking (COLING 2014 technical paper).
@inproceedings{qiu-etal-2014-multi,
  author        = {Qiu, Likun and Zhang, Yue and Jin, Peng and Wang, Houfeng},
  title         = {Multi-view {C}hinese Treebanking},
  booktitle     = {Proceedings of {COLING} 2014, the 25th International Conference on Computational Linguistics: Technical Papers},
  year          = {2014},
  month         = aug,
  address       = {Dublin, Ireland},
  publisher     = {Dublin City University and Association for Computational Linguistics},
  pages         = {257--268},
  url           = {https://aclanthology.org/C14-1026},
  date-added    = {2022-02-15 04:42:58 -0500},
  date-modified = {2022-02-15 04:42:58 -0500},
  bdsk-url-1    = {https://aclanthology.org/C14-1026},
}

%% CA8: analogical reasoning benchmark for Chinese word embeddings (ACL 2018 short paper).
@inproceedings{li-etal-2018-analogical,
  author        = {Li, Shen and Zhao, Zhe and Hu, Renfen and Li, Wensi and Liu, Tao and Du, Xiaoyong},
  title         = {Analogical Reasoning on {C}hinese Morphological and Semantic Relations},
  booktitle     = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  year          = {2018},
  month         = jul,
  address       = {Melbourne, Australia},
  publisher     = {Association for Computational Linguistics},
  pages         = {138--143},
  doi           = {10.18653/v1/P18-2023},
  url           = {https://aclanthology.org/P18-2023},
  abstract      = {Analogical reasoning is effective in capturing linguistic regularities. This paper proposes an analogical reasoning task on Chinese. After delving into Chinese lexical knowledge, we sketch 68 implicit morphological relations and 28 explicit semantic relations. A big and balanced dataset CA8 is then built for this task, including 17813 questions. Furthermore, we systematically explore the influences of vector representations, context features, and corpora on analogical reasoning. With the experiments, CA8 is proved to be a reliable benchmark for evaluating Chinese word embeddings.},
  date-added    = {2022-01-30 22:52:52 -0500},
  date-modified = {2022-01-30 22:52:52 -0500},
  bdsk-url-1    = {https://aclanthology.org/P18-2023},
  bdsk-url-2    = {https://doi.org/10.18653/v1/P18-2023},
}

%% Mikolov et al., distributed representations of words and phrases (NeurIPS proceedings, volume 26).
%% The author middle initial carries its period so name formatting is uniform across styles.
@inproceedings{NIPS2013_9aa42b31,
	author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeff},
	booktitle = {Advances in Neural Information Processing Systems},
	date-added = {2022-01-30 18:17:28 -0500},
	date-modified = {2022-01-30 18:17:28 -0500},
	editor = {C. J. C. Burges and L. Bottou and M. Welling and Z. Ghahramani and K. Q. Weinberger},
	publisher = {Curran Associates, Inc.},
	title = {Distributed Representations of Words and Phrases and their Compositionality},
	url = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf},
	volume = {26},
	year = {2013},
	bdsk-url-1 = {https://proceedings.neurips.cc/paper/2013/file/9aa42b31882ec039965f3c4923ce901b-Paper.pdf}}

%% SPRING: symmetric AMR parsing and generation (AAAI 2021).
%% The venue is spelled out with the acronym brace-protected, matching the full proceedings
%% names used by the other conference entries in this file.
@inproceedings{bevilacqua-etal-2021-one,
	author = {Bevilacqua, Michele and Blloshmi, Rexhina and Navigli, Roberto},
	booktitle = {Proceedings of the {AAAI} Conference on Artificial Intelligence},
	date-added = {2022-01-25 11:58:03 -0500},
	date-modified = {2022-01-25 11:58:03 -0500},
	title = {One {SPRING} to Rule Them Both: {S}ymmetric {AMR} Semantic Parsing and Generation without a Complex Pipeline},
	year = {2021}}

%% BART: denoising sequence-to-sequence pre-training (ACL 2020).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{lewis-etal-2020-bart,
	abstract = {We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Tranformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and other recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa on GLUE and SQuAD, and achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 3.5 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. We also replicate other pretraining schemes within the BART framework, to understand their effect on end-task performance.},
	address = {Online},
	author = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke},
	booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	date-added = {2022-01-25 11:56:10 -0500},
	date-modified = {2022-01-25 11:56:10 -0500},
	doi = {10.18653/v1/2020.acl-main.703},
	month = jul,
	pages = {7871--7880},
	publisher = {Association for Computational Linguistics},
	title = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
	url = {https://aclanthology.org/2020.acl-main.703},
	year = {2020},
	bdsk-url-1 = {https://aclanthology.org/2020.acl-main.703},
	bdsk-url-2 = {https://doi.org/10.18653/v1/2020.acl-main.703}}

%% AMR Annotation Release 1.0, a corpus release rather than a journal article.
%% Typed as misc, with the former journal text kept in howpublished.
%% Misspelled author names corrected ("Lauren" to "Laura", "Schneifer" to "Schneider") and
%% title capitalisation restored with the acronym protected.
%% NOTE(review): the surname "Baranescu" is left as found; confirm it against the LDC catalog record.
@misc{knight2014abstract,
	author = {Knight, Kevin and Baranescu, Laura and Bonial, Claire and Georgescu, Madalina and Griffitt, Kira and Hermjakob, Ulf and Marcu, Daniel and Palmer, Martha and Schneider, Nathan},
	date-added = {2022-01-25 11:54:11 -0500},
	date-modified = {2022-01-25 11:54:11 -0500},
	howpublished = {Web download},
	publisher = {Linguistic Data Consortium},
	title = {Abstract Meaning Representation ({AMR}) Annotation Release 1.0},
	url = {https://catalog.ldc.upenn.edu/LDC2014T12},
	year = {2014}}

%% The Stem Cell Hypothesis: analysis of multi-task learning with transformer encoders (EMNLP 2021).
@inproceedings{he-choi-2021-stem,
  author        = {He, Han and Choi, Jinho D.},
  title         = {The Stem Cell Hypothesis: Dilemma behind Multi-Task Learning with Transformer Encoders},
  booktitle     = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  year          = {2021},
  month         = nov,
  address       = {Online and Punta Cana, Dominican Republic},
  publisher     = {Association for Computational Linguistics},
  pages         = {5555--5577},
  url           = {https://aclanthology.org/2021.emnlp-main.451},
  abstract      = {Multi-task learning with transformer encoders (MTL) has emerged as a powerful technique to improve performance on closely-related tasks for both accuracy and efficiency while a question still remains whether or not it would perform as well on tasks that are distinct in nature. We first present MTL results on five NLP tasks, POS, NER, DEP, CON, and SRL, and depict its deficiency over single-task learning. We then conduct an extensive pruning analysis to show that a certain set of attention heads get claimed by most tasks during MTL, who interfere with one another to fine-tune those heads for their own objectives. Based on this finding, we propose the Stem Cell Hypothesis to reveal the existence of attention heads naturally talented for many tasks that cannot be jointly trained to create adequate embeddings for all of those tasks. Finally, we design novel parameter-free probes to justify our hypothesis and demonstrate how attention heads are transformed across the five tasks during MTL through label analysis.},
  date-added    = {2021-11-06 18:24:44 -0400},
  date-modified = {2021-11-06 18:24:44 -0400},
  bdsk-url-1    = {https://aclanthology.org/2021.emnlp-main.451},
}

%% BERT baselines for sequence tagging, syntactic and semantic parsing (FLAIRS-33, 2020).
%% Authors use the unambiguous "Last, First" form, spelled as in this file's other He and Choi
%% entries; the acronyms in title and booktitle are brace-protected against style recasing.
@inproceedings{he-choi-2019,
	abstract = {This paper presents new state-of-the-art models for three tasks, part-of-speech tagging, syntactic parsing, and semantic parsing, using the cutting-edge contextualized embedding framework known as BERT. For each task, we first replicate and simplify the current state-of-the-art approach to enhance its model efficiency. We then evaluate our simplified approaches on those three tasks using token embeddings generated by BERT. 12 datasets in both English and Chinese are used for our experiments. The BERT models outperform the previously best-performing models by 2.5\% on average (7.5\% for the most significant case). All models and source codes are available in public so that researchers can improve upon and utilize them to establish strong baselines for the next decade.},
	author = {He, Han and Choi, Jinho D.},
	booktitle = {The Thirty-Third International {FLAIRS} Conference},
	conference = {Florida Artificial Intelligence Research Society Conference},
	date-added = {2021-10-16 21:09:00 -0400},
	date-modified = {2021-10-16 21:09:00 -0400},
	keywords = {part-of-speech tagging, syntactic parsing, semantic parsing, Transformer, BERT},
	title = {Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with {BERT}},
	url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438},
	year = {2020},
	bdsk-url-1 = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}}

%% ERNIE-Gram: explicit n-gram masked language modeling for pre-training (NAACL 2021).
@inproceedings{xiao-etal-2021-ernie,
  author        = {Xiao, Dongling and Li, Yu-Kun and Zhang, Han and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
  title         = {{ERNIE}-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding},
  booktitle     = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year          = {2021},
  month         = jun,
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  pages         = {1702--1715},
  doi           = {10.18653/v1/2021.naacl-main.136},
  url           = {https://aclanthology.org/2021.naacl-main.136},
  abstract      = {Coarse-grained linguistic information, such as named entities or phrases, facilitates adequately representation learning in pre-training. Previous works mainly focus on extending the objective of BERT{'}s Masked Language Modeling (MLM) from masking individual tokens to contiguous sequences of n tokens. We argue that such contiguously masking method neglects to model the intra-dependencies and inter-relation of coarse-grained linguistic information. As an alternative, we propose ERNIE-Gram, an explicitly n-gram masking method to enhance the integration of coarse-grained information into pre-training. In ERNIE-Gram, n-grams are masked and predicted directly using explicit n-gram identities rather than contiguous sequences of n tokens. Furthermore, ERNIE-Gram employs a generator model to sample plausible n-gram identities as optional n-gram masks and predict them in both coarse-grained and fine-grained manners to enable comprehensive n-gram prediction and relation modeling. We pre-train ERNIE-Gram on English and Chinese text corpora and fine-tune on 19 downstream tasks. Experimental results show that ERNIE-Gram outperforms previous pre-training models like XLNet and RoBERTa by a large margin, and achieves comparable results with state-of-the-art methods. The source codes and pre-trained models have been released at https://github.com/PaddlePaddle/ERNIE.},
  date-added    = {2021-09-04 14:09:52 -0400},
  date-modified = {2021-09-04 14:09:52 -0400},
  bdsk-url-1    = {https://aclanthology.org/2021.naacl-main.136},
  bdsk-url-2    = {https://doi.org/10.18653/v1/2021.naacl-main.136},
}

%% Contextual string embeddings for sequence labeling (COLING 2018).
@inproceedings{akbik-etal-2018-contextual,
  author        = {Akbik, Alan and Blythe, Duncan and Vollgraf, Roland},
  title         = {Contextual String Embeddings for Sequence Labeling},
  booktitle     = {Proceedings of the 27th International Conference on Computational Linguistics},
  year          = {2018},
  month         = aug,
  address       = {Santa Fe, New Mexico, USA},
  publisher     = {Association for Computational Linguistics},
  pages         = {1638--1649},
  url           = {https://aclanthology.org/C18-1139},
  abstract      = {Recent advances in language modeling using recurrent neural networks have made it viable to model language as distributions over characters. By learning to predict the next character on the basis of previous characters, such models have been shown to automatically internalize linguistic concepts such as words, sentences, subclauses and even sentiment. In this paper, we propose to leverage the internal states of a trained character language model to produce a novel type of word embedding which we refer to as contextual string embeddings. Our proposed embeddings have the distinct properties that they (a) are trained without any explicit notion of words and thus fundamentally model words as sequences of characters, and (b) are contextualized by their surrounding text, meaning that the same word will have different embeddings depending on its contextual use. We conduct a comparative evaluation against previous embeddings and find that our embeddings are highly useful for downstream tasks: across four classic sequence labeling tasks we consistently outperform the previous state-of-the-art. In particular, we significantly outperform previous work on English and German named entity recognition (NER), allowing us to report new state-of-the-art F1-scores on the CoNLL03 shared task. We release all code and pre-trained language models in a simple-to-use framework to the research community, to enable reproduction of these experiments and application of our proposed embeddings to other tasks: https://github.com/zalandoresearch/flair},
  date-added    = {2021-09-01 13:10:59 -0400},
  date-modified = {2021-09-01 13:10:59 -0400},
  bdsk-url-1    = {https://aclanthology.org/C18-1139},
}

%% Levi graph AMR parser using heterogeneous attention (IWPT 2021).
@inproceedings{he-choi-2021-levi,
  author        = {He, Han and Choi, Jinho D.},
  title         = {Levi Graph {AMR} Parser using Heterogeneous Attention},
  booktitle     = {Proceedings of the 17th International Conference on Parsing Technologies and the IWPT 2021 Shared Task on Parsing into Enhanced Universal Dependencies (IWPT 2021)},
  year          = {2021},
  month         = aug,
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  pages         = {50--57},
  doi           = {10.18653/v1/2021.iwpt-1.5},
  url           = {https://aclanthology.org/2021.iwpt-1.5},
  abstract      = {Coupled with biaffine decoders, transformers have been effectively adapted to text-to-graph transduction and achieved state-of-the-art performance on AMR parsing. Many prior works, however, rely on the biaffine decoder for either or both arc and label predictions although most features used by the decoder may be learned by the transformer already. This paper presents a novel approach to AMR parsing by combining heterogeneous data (tokens, concepts, labels) as one input to a transformer to learn attention, and use only attention matrices from the transformer to predict all elements in AMR graphs (concepts, arcs, labels). Although our models use significantly fewer parameters than the previous state-of-the-art graph parser, they show similar or better accuracy on AMR 2.0 and 3.0.},
  date-added    = {2021-09-01 13:09:14 -0400},
  date-modified = {2021-09-01 13:09:14 -0400},
  bdsk-url-1    = {https://aclanthology.org/2021.iwpt-1.5},
  bdsk-url-2    = {https://doi.org/10.18653/v1/2021.iwpt-1.5},
}

%% XLM-R: unsupervised cross-lingual representation learning at scale (ACL 2020).
@inproceedings{conneau-etal-2020-unsupervised,
  author        = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
  title         = {Unsupervised Cross-lingual Representation Learning at Scale},
  booktitle     = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  year          = {2020},
  month         = jul,
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  pages         = {8440--8451},
  doi           = {10.18653/v1/2020.acl-main.747},
  url           = {https://aclanthology.org/2020.acl-main.747},
  abstract      = {This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6{\%} average accuracy on XNLI, +13{\%} average F1 score on MLQA, and +2.4{\%} F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7{\%} in XNLI accuracy for Swahili and 11.4{\%} for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.},
  date-added    = {2021-09-01 12:41:50 -0400},
  date-modified = {2021-09-01 12:41:50 -0400},
  bdsk-url-1    = {https://aclanthology.org/2020.acl-main.747},
  bdsk-url-2    = {https://doi.org/10.18653/v1/2020.acl-main.747},
}

%% mT5: multilingual pre-trained text-to-text transformer (NAACL 2021).
@inproceedings{xue-etal-2021-mt5,
  author        = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin},
  title         = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer},
  booktitle     = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  year          = {2021},
  month         = jun,
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  pages         = {483--498},
  doi           = {10.18653/v1/2021.naacl-main.41},
  url           = {https://aclanthology.org/2021.naacl-main.41},
  abstract      = {The recent {``}Text-to-Text Transfer Transformer{''} (T5) leveraged a unified text-to-text format and scale to attain state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual benchmarks. We also describe a simple technique to prevent {``}accidental translation{''} in the zero-shot setting, where a generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model checkpoints used in this work are publicly available.},
  date-added    = {2021-09-01 12:40:34 -0400},
  date-modified = {2021-09-01 12:40:34 -0400},
  bdsk-url-1    = {https://aclanthology.org/2021.naacl-main.41},
  bdsk-url-2    = {https://doi.org/10.18653/v1/2021.naacl-main.41},
}

%% Chinese Treebank 9.0 (Linguistic Data Consortium release LDC2016T13).
%% Author names are plain "Last, First" so each is parsed into surname and given name; a name
%% wrapped whole in braces is treated as one indivisible surname and cannot be abbreviated.
%% The corpus name is brace-protected against sentence-casing styles.
@misc{https://doi.org/10.35111/gvd0-xk91,
	author = {Xue, Nianwen and Zhang, Xiuhong and Jiang, Zixin and Palmer, Martha and Xia, Fei and Chiou, Fu-Dong and Chang, Meiyu},
	date-added = {2021-09-01 12:32:05 -0400},
	date-modified = {2021-09-01 12:36:22 -0400},
	doi = {10.35111/GVD0-XK91},
	publisher = {Linguistic Data Consortium},
	title = {{C}hinese {T}reebank 9.0},
	url = {https://catalog.ldc.upenn.edu/LDC2016T13},
	year = {2016},
	bdsk-url-1 = {https://catalog.ldc.upenn.edu/LDC2016T13},
	bdsk-url-2 = {https://doi.org/10.35111/GVD0-XK91}}

%% ELECTRA: pre-training text encoders as discriminators (ICLR 2020).
%% The venue is written out in full and authors use the unambiguous "Last, First" form.
@inproceedings{clark2020electra,
	author = {Clark, Kevin and Luong, Minh-Thang and Le, Quoc V. and Manning, Christopher D.},
	booktitle = {International Conference on Learning Representations},
	date-added = {2021-08-07 15:53:27 -0400},
	date-modified = {2021-08-07 15:53:27 -0400},
	title = {{ELECTRA}: Pre-training Text Encoders as Discriminators Rather Than Generators},
	url = {https://openreview.net/pdf?id=r1xMH1BtvB},
	year = {2020},
	bdsk-url-1 = {https://openreview.net/pdf?id=r1xMH1BtvB}}

%% Discriminative reordering with Chinese grammatical relations features (SSST-3 workshop, 2009).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{chang-etal-2009-discriminative,
	address = {Boulder, Colorado},
	author = {Chang, Pi-Chuan and Tseng, Huihsin and Jurafsky, Dan and Manning, Christopher D.},
	booktitle = {Proceedings of the Third Workshop on Syntax and Structure in Statistical Translation ({SSST}-3) at {NAACL} {HLT} 2009},
	date-added = {2021-03-17 13:37:03 -0400},
	date-modified = {2021-03-17 13:37:03 -0400},
	month = jun,
	pages = {51--59},
	publisher = {Association for Computational Linguistics},
	title = {Discriminative Reordering with {C}hinese Grammatical Relations Features},
	url = {https://aclanthology.org/W09-2307},
	year = {2009},
	bdsk-url-1 = {https://aclanthology.org/W09-2307}}

%% GloVe: global vectors for word representation (EMNLP 2014).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{pennington-etal-2014-glove,
	address = {Doha, Qatar},
	author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher},
	booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
	date-added = {2020-12-31 15:07:29 -0500},
	date-modified = {2020-12-31 15:07:29 -0500},
	doi = {10.3115/v1/D14-1162},
	month = oct,
	pages = {1532--1543},
	publisher = {Association for Computational Linguistics},
	title = {{G}lo{V}e: Global Vectors for Word Representation},
	url = {https://aclanthology.org/D14-1162},
	year = {2014},
	bdsk-url-1 = {https://aclanthology.org/D14-1162},
	bdsk-url-2 = {https://doi.org/10.3115/v1/D14-1162}}

%% Dual LSTM networks for sub-character representation learning (Springer collection, 2018).
%% The title is stored in Title Case so styles that do not recase still print it capitalised;
%% the booktitle separator is written as a dash rather than a hyphen.
@incollection{he2018dual,
	author = {He, Han and Wu, Lei and Yang, Xiaokun and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George},
	booktitle = {Information Technology -- New Generations},
	date-added = {2020-12-31 15:03:58 -0500},
	date-modified = {2020-12-31 15:03:58 -0500},
	pages = {421--426},
	publisher = {Springer},
	title = {Dual Long Short-Term Memory Networks for Sub-Character Representation Learning},
	year = {2018}}

%% BERT: pre-training of deep bidirectional transformers (NAACL 2019).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{devlin-etal-2019-bert,
	abstract = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7{\%} (4.6{\%} absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).},
	address = {Minneapolis, Minnesota},
	author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
	booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
	date-added = {2020-12-31 14:46:54 -0500},
	date-modified = {2020-12-31 14:46:54 -0500},
	doi = {10.18653/v1/N19-1423},
	month = jun,
	pages = {4171--4186},
	publisher = {Association for Computational Linguistics},
	title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
	url = {https://aclanthology.org/N19-1423},
	year = {2019},
	bdsk-url-1 = {https://aclanthology.org/N19-1423},
	bdsk-url-2 = {https://doi.org/10.18653/v1/N19-1423}}

%% ALBERT: a lite BERT for self-supervised learning of language representations (ICLR 2020).
%% Both acronyms in the title are brace-protected so sentence-casing styles cannot lower-case them;
%% authors use the unambiguous "Last, First" form.
@inproceedings{Lan2020ALBERT:,
	author = {Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
	booktitle = {International Conference on Learning Representations},
	date-added = {2020-12-31 14:44:52 -0500},
	date-modified = {2020-12-31 14:44:52 -0500},
	title = {{ALBERT}: A Lite {BERT} for Self-supervised Learning of Language Representations},
	url = {https://openreview.net/forum?id=H1eA7AEtvS},
	year = {2020},
	bdsk-url-1 = {https://openreview.net/forum?id=H1eA7AEtvS}}

%% Convolutional network with word embeddings for Chinese word segmentation (IJCNLP 2017).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{wang-xu-2017-convolutional,
	abstract = {Character-based sequence labeling framework is flexible and efficient for Chinese word segmentation (CWS). Recently, many character-based neural models have been applied to CWS. While they obtain good performance, they have two obvious weaknesses. The first is that they heavily rely on manually designed bigram feature, i.e. they are not good at capturing $n$-gram features automatically. The second is that they make no use of full word information. For the first weakness, we propose a convolutional neural model, which is able to capture rich $n$-gram features without any feature engineering. For the second one, we propose an effective approach to integrate the proposed model with word embeddings. We evaluate the model on two benchmark datasets: PKU and MSR. Without any feature engineering, the model obtains competitive performance {---} 95.7{\%} on PKU and 97.3{\%} on MSR. Armed with word embeddings, the model achieves state-of-the-art performance on both datasets {---} 96.5{\%} on PKU and 98.0{\%} on MSR, without using any external labeled resource.},
	address = {Taipei, Taiwan},
	author = {Wang, Chunqi and Xu, Bo},
	booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
	date-added = {2020-12-31 14:42:35 -0500},
	date-modified = {2020-12-31 14:42:35 -0500},
	month = nov,
	pages = {163--172},
	publisher = {Asian Federation of Natural Language Processing},
	title = {Convolutional Neural Network with Word Embeddings for {C}hinese Word Segmentation},
	url = {https://aclanthology.org/I17-1017},
	year = {2017},
	bdsk-url-1 = {https://aclanthology.org/I17-1017}}

%% Enriching word vectors with subword information (TACL volume 5, 2017).
@article{bojanowski2017enriching,
  author        = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  title         = {Enriching Word Vectors with Subword Information},
  journal       = {Transactions of the Association for Computational Linguistics},
  year          = {2017},
  volume        = {5},
  pages         = {135--146},
  issn          = {2307-387X},
  date-added    = {2020-12-25 22:31:59 -0500},
  date-modified = {2020-12-25 22:31:59 -0500},
}

%% Discriminative reranking for natural language parsing (Computational Linguistics 31(1), 2005).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@article{collins-koo-2005-discriminative,
	author = {Collins, Michael and Koo, Terry},
	date-added = {2020-12-25 17:25:59 -0500},
	date-modified = {2020-12-25 17:25:59 -0500},
	doi = {10.1162/0891201053630273},
	journal = {Computational Linguistics},
	number = {1},
	pages = {25--70},
	title = {Discriminative Reranking for Natural Language Parsing},
	url = {https://aclanthology.org/J05-1003},
	volume = {31},
	year = {2005},
	bdsk-url-1 = {https://aclanthology.org/J05-1003},
	bdsk-url-2 = {https://doi.org/10.1162/0891201053630273}}

%% Combining graph-based and transition-based dependency parsing (EMNLP 2008).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{zhang-clark-2008-tale,
	address = {Honolulu, Hawaii},
	author = {Zhang, Yue and Clark, Stephen},
	booktitle = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing},
	date-added = {2020-12-25 15:10:10 -0500},
	date-modified = {2020-12-25 15:10:10 -0500},
	month = oct,
	pages = {562--571},
	publisher = {Association for Computational Linguistics},
	title = {A Tale of Two Parsers: {I}nvestigating and Combining Graph-based and Transition-based Dependency Parsing},
	url = {https://aclanthology.org/D08-1059},
	year = {2008},
	bdsk-url-1 = {https://aclanthology.org/D08-1059}}

%% CoNLL-2012 shared task on multilingual unrestricted coreference in OntoNotes.
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{pradhan-etal-2012-conll,
	address = {Jeju Island, Korea},
	author = {Pradhan, Sameer and Moschitti, Alessandro and Xue, Nianwen and Uryupina, Olga and Zhang, Yuchen},
	booktitle = {Joint Conference on {EMNLP} and {C}o{NLL} - Shared Task},
	date-added = {2020-12-24 23:42:41 -0500},
	date-modified = {2020-12-24 23:42:41 -0500},
	month = jul,
	pages = {1--40},
	publisher = {Association for Computational Linguistics},
	title = {{C}o{NLL}-2012 Shared Task: Modeling Multilingual Unrestricted Coreference in {O}nto{N}otes},
	url = {https://aclanthology.org/W12-4501},
	year = {2012},
	bdsk-url-1 = {https://aclanthology.org/W12-4501}}

%% Third International Chinese Language Processing Bakeoff (Fifth SIGHAN Workshop, 2006).
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{levow-2006-third,
	address = {Sydney, Australia},
	author = {Levow, Gina-Anne},
	booktitle = {Proceedings of the Fifth {SIGHAN} Workshop on {C}hinese Language Processing},
	date-added = {2020-12-24 23:21:14 -0500},
	date-modified = {2020-12-24 23:21:14 -0500},
	month = jul,
	pages = {108--117},
	publisher = {Association for Computational Linguistics},
	title = {The Third International {C}hinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition},
	url = {https://aclanthology.org/W06-0115},
	year = {2006},
	bdsk-url-1 = {https://aclanthology.org/W06-0115}}

%% Introduction to the CoNLL-2003 shared task on language-independent named entity recognition.
%% url and bdsk-url-1 use the aclanthology.org host, as the newer Anthology entries in this file do.
@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,
	author = {Tjong Kim Sang, Erik F. and De Meulder, Fien},
	booktitle = {Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003},
	date-added = {2020-12-24 23:19:00 -0500},
	date-modified = {2020-12-24 23:19:00 -0500},
	pages = {142--147},
	title = {Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition},
	url = {https://aclanthology.org/W03-0419},
	year = {2003},
	bdsk-url-1 = {https://aclanthology.org/W03-0419}}

%% Europarl parallel corpus for statistical machine translation (MT Summit, 2005).
%% Two scraper artefacts are removed: organization "Citeseer", which names an indexing service
%% and not a sponsor, and volume "5", which does not describe these proceedings.
%% The title is in Title Case and the venue acronym is brace-protected.
@inproceedings{koehn2005europarl,
	author = {Koehn, Philipp},
	booktitle = {{MT} Summit},
	date-added = {2020-12-24 23:06:03 -0500},
	date-modified = {2020-12-24 23:06:03 -0500},
	pages = {79--86},
	title = {Europarl: A Parallel Corpus for Statistical Machine Translation},
	year = {2005}}

%% Deep-EOS: neural sentence boundary detection (KONVENS 2019).
%% Only the system name in the title is brace-protected, so styles can still apply their own
%% casing to the rest; bracing the whole title disables that. The venue acronym is protected
%% and authors use the unambiguous "Last, First" form.
%% NOTE(review): note = accepted may be stale for a 2019 paper; confirm and drop if published.
@inproceedings{Schweter:Ahmed:2019,
	author = {Schweter, Stefan and Ahmed, Sajawel},
	booktitle = {Proceedings of the 15th Conference on Natural Language Processing ({KONVENS})},
	date-added = {2020-12-24 23:03:23 -0500},
	date-modified = {2020-12-24 23:03:23 -0500},
	location = {Erlangen, Germany},
	note = {accepted},
	title = {{Deep-EOS}: General-Purpose Neural Networks for Sentence Boundary Detection},
	year = {2019}}

@incollection{he2019effective,
	author = {He, Han and Wu, Lei and Yan, Hua and Gao, Zhimin and Feng, Yi and Townsend, George},
	booktitle = {Smart Intelligent Computing and Applications},
	date-added = {2020-12-24 19:35:03 -0500},
	date-modified = {2020-12-24 19:35:03 -0500},
	pages = {133--142},
	publisher = {Springer},
	title = {Effective Neural Solution for Multi-criteria Word Segmentation},
	year = {2019}}

@inproceedings{dozat2017stanford,
	address = {Vancouver, Canada},
	author = {Dozat, Timothy and Qi, Peng and Manning, Christopher D.},
	booktitle = {Proceedings of the {CoNLL} 2017 Shared Task: Multilingual Parsing from Raw Text to {Universal Dependencies}},
	date-added = {2020-12-24 15:02:18 -0500},
	date-modified = {2020-12-24 15:02:18 -0500},
	doi = {10.18653/v1/K17-3002},
	month = aug,
	pages = {20--30},
	publisher = {Association for Computational Linguistics},
	title = {{Stanford}'s Graph-based Neural Dependency Parser at the {CoNLL} 2017 Shared Task},
	url = {https://aclanthology.org/K17-3002},
	year = {2017}}

@inproceedings{he-etal-2018-jointly,
	author        = {He, Luheng and Lee, Kenton and Levy, Omer and Zettlemoyer, Luke},
	title         = {Jointly Predicting Predicates and Arguments in Neural Semantic Role Labeling},
	booktitle     = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
	year          = {2018},
	month         = jul,
	pages         = {364--369},
	publisher     = {Association for Computational Linguistics},
	address       = {Melbourne, Australia},
	doi           = {10.18653/v1/P18-2058},
	url           = {https://www.aclweb.org/anthology/P18-2058},
	abstract      = {Recent BIO-tagging-based neural semantic role labeling models are very high performing, but assume gold predicates as part of the input and cannot incorporate span-level features. We propose an end-to-end approach for jointly predicting all predicates, arguments spans, and the relations between them. The model makes independent decisions about what relationship, if any, holds between every possible word-span pair, and learns contextualized span representations that provide rich, shared input features for each decision. Experiments demonstrate that this approach sets a new state of the art on PropBank SRL without gold predicates.},
	date-added    = {2020-12-24 14:23:45 -0500},
	date-modified = {2020-12-24 14:23:45 -0500},
	bdsk-url-1    = {https://www.aclweb.org/anthology/P18-2058},
	bdsk-url-2    = {https://doi.org/10.18653/v1/P18-2058},
}

@inproceedings{yu-etal-2020-named,
	author        = {Yu, Juntao and Bohnet, Bernd and Poesio, Massimo},
	title         = {Named Entity Recognition as Dependency Parsing},
	booktitle     = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
	year          = {2020},
	month         = jul,
	pages         = {6470--6476},
	publisher     = {Association for Computational Linguistics},
	address       = {Online},
	doi           = {10.18653/v1/2020.acl-main.577},
	url           = {https://www.aclweb.org/anthology/2020.acl-main.577},
	abstract      = {Named Entity Recognition (NER) is a fundamental task in Natural Language Processing, concerned with identifying spans of text expressing references to entities. NER research is often focused on flat entities only (flat NER), ignoring the fact that entity references can be nested, as in [Bank of [China]] (Finkel and Manning, 2009). In this paper, we use ideas from graph-based dependency parsing to provide our model a global view on the input via a biaffine model (Dozat and Manning, 2017). The biaffine model scores pairs of start and end tokens in a sentence which we use to explore all spans, so that the model is able to predict named entities accurately. We show that the model works well for both nested and flat NER through evaluation on 8 corpora and achieving SoTA performance on all of them, with accuracy gains of up to 2.2 percentage points.},
	date-added    = {2020-12-24 13:35:09 -0500},
	date-modified = {2020-12-24 13:35:09 -0500},
	bdsk-url-1    = {https://www.aclweb.org/anthology/2020.acl-main.577},
	bdsk-url-2    = {https://doi.org/10.18653/v1/2020.acl-main.577},
}

@inproceedings{10.1145/1457838.1457895,
	abstract = {Many computer applications require the storage of large amounts of information within the computer's memory where it will be readily available for reference and updating. Quite commonly, more storage space is required than is available in the computer's high-speed working memory. It is, therefore, a common practice to equip computers with magnetic tapes, disks, or drums, or a combination of these to provide additional storage. This additional storage is always slower in operation than the computer's working memory and therefore care must be taken when using it to avoid excessive operating time.},
	address = {New York, NY, USA},
	author = {De La Briandais, Rene},
	booktitle = {Papers Presented at the March 3--5, 1959, Western Joint Computer Conference},
	date-added = {2020-12-24 13:07:31 -0500},
	date-modified = {2020-12-24 13:07:31 -0500},
	doi = {10.1145/1457838.1457895},
	isbn = {9781450378659},
	location = {San Francisco, California},
	month = mar,
	numpages = {4},
	pages = {295--298},
	publisher = {Association for Computing Machinery},
	series = {IRE-AIEE-ACM '59 (Western)},
	title = {File Searching Using Variable Length Keys},
	url = {https://doi.org/10.1145/1457838.1457895},
	year = {1959},
	bdsk-url-1 = {https://doi.org/10.1145/1457838.1457895}}

@inproceedings{lafferty2001conditional,
	address = {San Francisco, CA, USA},
	author = {Lafferty, John D. and McCallum, Andrew and Pereira, Fernando C. N.},
	booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning},
	date-added = {2020-12-24 11:46:30 -0500},
	date-modified = {2020-12-24 12:08:29 -0500},
	pages = {282--289},
	publisher = {Morgan Kaufmann Publishers Inc.},
	series = {ICML '01},
	title = {Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data},
	year = {2001}}

@inproceedings{clark-etal-2019-bam,
	author        = {Clark, Kevin and Luong, Minh-Thang and Khandelwal, Urvashi and Manning, Christopher D. and Le, Quoc V.},
	title         = {{BAM}! Born-Again Multi-Task Networks for Natural Language Understanding},
	booktitle     = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
	year          = {2019},
	month         = jul,
	pages         = {5931--5937},
	publisher     = {Association for Computational Linguistics},
	address       = {Florence, Italy},
	doi           = {10.18653/v1/P19-1595},
	url           = {https://www.aclweb.org/anthology/P19-1595},
	abstract      = {It can be challenging to train multi-task neural networks that outperform or even match their single-task counterparts. To help address this, we propose using knowledge distillation where single-task models teach a multi-task model. We enhance this training with teacher annealing, a novel method that gradually transitions the model from distillation to supervised learning, helping the multi-task model surpass its single-task teachers. We evaluate our approach by multi-task fine-tuning BERT on the GLUE benchmark. Our method consistently improves over standard single-task and multi-task training.},
	date-added    = {2020-12-24 11:26:54 -0500},
	date-modified = {2020-12-24 11:26:54 -0500},
	bdsk-url-1    = {https://www.aclweb.org/anthology/P19-1595},
	bdsk-url-2    = {https://doi.org/10.18653/v1/P19-1595},
}

@inproceedings{kondratyuk-straka-2019-75,
	address = {Hong Kong, China},
	author = {Kondratyuk, Dan and Straka, Milan},
	booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing ({EMNLP}-{IJCNLP})},
	date-added = {2020-12-23 23:51:07 -0500},
	date-modified = {2020-12-23 23:51:07 -0500},
	doi = {10.18653/v1/D19-1279},
	month = nov,
	pages = {2779--2795},
	publisher = {Association for Computational Linguistics},
	title = {75 Languages, 1 Model: Parsing {Universal Dependencies} Universally},
	url = {https://www.aclweb.org/anthology/D19-1279},
	year = {2019},
	bdsk-url-1 = {https://www.aclweb.org/anthology/D19-1279}}

@inproceedings{dozat:17a,
	author = {Dozat, Timothy and Manning, Christopher D.},
	booktitle = {Proceedings of the 5th International Conference on Learning Representations},
	date-added = {2020-12-23 23:46:20 -0500},
	date-modified = {2020-12-23 23:46:20 -0500},
	series = {ICLR'17},
	title = {Deep Biaffine Attention for Neural Dependency Parsing},
	url = {https://openreview.net/pdf?id=Hk95PK9le},
	year = {2017},
	bdsk-url-1 = {http://arxiv.org/abs/1611.01734},
	bdsk-url-2 = {https://openreview.net/pdf?id=Hk95PK9le}}

@inproceedings{smith-smith-2007-probabilistic,
	author        = {Smith, David A. and Smith, Noah A.},
	title         = {Probabilistic Models of Nonprojective Dependency Trees},
	booktitle     = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning ({EMNLP}-{C}o{NLL})},
	year          = {2007},
	month         = jun,
	pages         = {132--140},
	publisher     = {Association for Computational Linguistics},
	address       = {Prague, Czech Republic},
	url           = {https://www.aclweb.org/anthology/D07-1014},
	date-added    = {2020-12-23 21:46:06 -0500},
	date-modified = {2020-12-23 21:46:06 -0500},
	bdsk-url-1    = {https://www.aclweb.org/anthology/D07-1014},
}

@inproceedings{ijcai2020-560,
	author = {Zhang, Yu and Zhou, Houquan and Li, Zhenghua},
	booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence, {IJCAI-20}},
	date-added = {2020-12-23 21:36:56 -0500},
	date-modified = {2020-12-23 21:36:56 -0500},
	doi = {10.24963/ijcai.2020/560},
	editor = {Bessiere, Christian},
	month = jul,
	note = {Main track},
	pages = {4046--4053},
	publisher = {International Joint Conferences on Artificial Intelligence Organization},
	title = {Fast and Accurate Neural {CRF} Constituency Parsing},
	url = {https://doi.org/10.24963/ijcai.2020/560},
	year = {2020},
	bdsk-url-1 = {https://doi.org/10.24963/ijcai.2020/560}}

@inproceedings{buchholz-marsi-2006-conll,
	author        = {Buchholz, Sabine and Marsi, Erwin},
	title         = {{C}o{NLL}-{X} Shared Task on Multilingual Dependency Parsing},
	booktitle     = {Proceedings of the Tenth Conference on Computational Natural Language Learning ({C}o{NLL}-X)},
	year          = {2006},
	month         = jun,
	pages         = {149--164},
	publisher     = {Association for Computational Linguistics},
	address       = {New York City},
	url           = {https://www.aclweb.org/anthology/W06-2920},
	date-added    = {2020-12-22 22:57:41 -0500},
	date-modified = {2020-12-22 22:57:41 -0500},
	bdsk-url-1    = {https://www.aclweb.org/anthology/W06-2920},
}
